In [5]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd
import glob
import json
from sklearn import cluster
from scipy.cluster import hierarchy
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

Covid19 Dataset Download URL: https:// drive.google.com/file/d/1IC0s9QoBLWFN9tRI-z2QbJJWgngfAm8w/view?usp=sharing (Filename: CORD-19-research-challenge.zip, File size: 1.58 GB)¶

update the root_path value according to your dataset location¶

In [6]:
root_path = 'Experiment3/CORD-19-research-challenge/'
metadata_path = f'{root_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
In [7]:
meta_df.head()
Out[7]:
cord_uid sha source_x title doi pmcid pubmed_id license abstract publish_time authors journal Microsoft Academic Paper ID WHO #Covidence has_pdf_parse has_pmc_xml_parse full_text_file url
0 xqhn0vbp 1e1286db212100993d03cc22374b624f7caee956 PMC Airborne rhinovirus detection and effect of ul... 10.1186/1471-2458-3-5 PMC140314 12525263 no-cc BACKGROUND: Rhinovirus, the most common cause ... 2003-01-13 Myatt, Theodore A; Johnston, Sebastian L; Rudn... BMC Public Health NaN NaN True True custom_license https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
1 gi6uaa83 8ae137c8da1607b3a8e4c946c07ca8bda67f88ac PMC Discovering human history from stomach bacteria 10.1186/gb-2003-4-5-213 PMC156578 12734001 no-cc Recent analyses of human pathogens have reveal... 2003-04-28 Disotell, Todd R Genome Biol NaN NaN True True custom_license https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
2 le0ogx1s NaN PMC A new recruit for the army of the men of death 10.1186/gb-2003-4-7-113 PMC193621 12844350 no-cc The army of the men of death, in John Bunyan's... 2003-06-27 Petsko, Gregory A Genome Biol NaN NaN False True custom_license https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3 fy4w7xz8 0104f6ceccf92ae8567a0102f89cbb976969a774 PMC Association of HLA class I with severe acute r... 10.1186/1471-2350-4-9 PMC212558 12969506 no-cc BACKGROUND: The human leukocyte antigen (HLA) ... 2003-09-12 Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean... BMC Med Genet NaN NaN True True custom_license https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
4 0qaoam29 5b68a553a7cbbea13472721cd1ad617d42b40c26 PMC A double epidemic model for the SARS propagation 10.1186/1471-2334-3-19 PMC222908 12964944 no-cc BACKGROUND: An epidemic of a Severe Acute Resp... 2003-09-10 Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine BMC Infect Dis NaN NaN True True custom_license https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
In [8]:
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)
len(all_json)
Out[8]:
59311
In [9]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            # Abstract
            for entry in content['abstract']:
                    self.abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                    self.body_text.append(entry['text'])
#             if 'abstract' in content:
#                 for entry in content['abstract']:
#                     self.abstract.append(entry['text'])
#             # Body text
#             if 'body_text' in content:
#                 for entry in content['body_text']:
#                     self.body_text.append(entry['text'])
                    
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
    
# Helper function adds break after every words when character length reach to certain amount. This is for the interactive plot so that hover tool fits the screen.
    
def get_breaks(content, length):
    data = ""
    words = content.split(' ')
    total_chars = 0

    # add break every length characters
    for i in range(len(words)):
        total_chars += len(words[i])
        if total_chars > length:
            data = data + "<br>" + words[i]
            total_chars = 0
        else:
            data = data + " " + words[i]
    return data
In [10]:
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
for idx, entry in enumerate(all_json):
#     if idx % (len(all_json) // 10000) == 0:
#         print(idx)
    try:
        if idx % (len(all_json) // 10) == 0:
            print(f'Processing index: {idx} of {len(all_json)}')
        content = FileReader(entry)

        # get metadata information
        meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
        # no metadata, skip this paper
        if len(meta_data) == 0:
            continue

        dict_['paper_id'].append(content.paper_id)
        dict_['abstract'].append(content.abstract)
        dict_['body_text'].append(content.body_text)

        # also create a column for the summary of abstract to be used in a plot
        if len(content.abstract) == 0: 
            # no abstract provided
            dict_['abstract_summary'].append("Not provided.")
        elif len(content.abstract.split(' ')) > 100:
            # abstract provided is too long for plot, take first 300 words append with ...
            info = content.abstract.split(' ')[:100]
            summary = get_breaks(' '.join(info), 40)
            dict_['abstract_summary'].append(summary + "...")
        else:
            # abstract is short enough
            summary = get_breaks(content.abstract, 40)
            dict_['abstract_summary'].append(summary)

        # get metadata information
        meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]

        try:
            # if more than one author
            authors = meta_data['authors'].values[0].split(';')
            if len(authors) > 2:
                # more than 2 authors, may be problem when plotting, so take first 2 append with ...
                dict_['authors'].append(". ".join(authors[:2]) + "...")
            else:
                # authors will fit in plot
                dict_['authors'].append(". ".join(authors))
        except Exception as e:
            # if only one author - or Null valie
            dict_['authors'].append(meta_data['authors'].values[0])

        # add the title information, add breaks when needed
        try:
            title = get_breaks(meta_data['title'].values[0], 40)
            dict_['title'].append(title)
        # if title was not provided
        except Exception as e:
            dict_['title'].append(meta_data['title'].values[0])

        # add the journal information
        dict_['journal'].append(meta_data['journal'].values[0])
        
    
    except Exception as e:
        continue
    
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
Processing index: 0 of 59311
Processing index: 5931 of 59311
Processing index: 11862 of 59311
Processing index: 17793 of 59311
Processing index: 23724 of 59311
Processing index: 29655 of 59311
Processing index: 35586 of 59311
Processing index: 41517 of 59311
Processing index: 47448 of 59311
Processing index: 53379 of 59311
Processing index: 59310 of 59311
In [11]:
dict_ = None
df_covid.describe()
Out[11]:
paper_id abstract body_text authors title journal abstract_summary
count 36009 36009 36009 35413 35973 34277 36009
unique 36009 26249 35981 33538 35652 5410 26239
top 4ed70c27f14b7f9e6219fe605eae2b21a229f23c In previous reports, workers have characterize... Domingo, Esteban In the Literature PLoS One Not provided.
freq 1 9704 3 14 9 1518 9704
In [12]:
df_covid['abstract_word_count'] = df_covid['abstract'].apply(lambda x: len(x.strip().split()))
df_covid['body_word_count'] = df_covid['body_text'].apply(lambda x: len(x.strip().split()))
In [13]:
df_covid.dropna(inplace=True)
df_covid = df_covid[df_covid.abstract != ''] #Remove rows which are missing abstracts
df_covid = df_covid[df_covid.body_text != ''] #Remove rows which are missing body_text
df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True) # remove duplicate rows having same abstract and body_text

After handling duplicates, finding the the count, mean, standard deviation minimum, and maximum values for the abstract word count and body word count¶

In [14]:
metrics = ['count', 'mean', 'std', 'min', 'max']
abstract_word_count_stat = df_covid.describe(include='all')['abstract_word_count']
body_word_count_stat = df_covid.describe(include='all')['body_word_count']

print('Abstract Word Count')
for metric in metrics:
    print('{} : {:.4f}'.format(metric, abstract_word_count_stat[metric]))

print('\nBody Word Count')
for metric in metrics:
    print('{} : {:.4f}'.format(metric, body_word_count_stat[metric]))
Abstract Word Count
count : 24584.0000
mean : 216.4467
std : 137.0651
min : 1.0000
max : 3694.0000

Body Word Count
count : 24584.0000
mean : 4435.4751
std : 3657.4214
min : 23.0000
max : 232431.0000
In [15]:
import re

df_covid['body_text'] = df_covid['body_text'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
df_covid['abstract'] = df_covid['abstract'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
In [16]:
def lower_case(input_str):
    input_str = input_str.lower()
    return input_str

df_covid['body_text'] = df_covid['body_text'].apply(lambda x: lower_case(x))
df_covid['abstract'] = df_covid['abstract'].apply(lambda x: lower_case(x))
In [17]:
text = df_covid.drop(["paper_id", "abstract", "abstract_word_count", "body_word_count", "authors", "title", "journal", "abstract_summary"], axis=1)
text_arr = text.stack().tolist()
len(text_arr)
Out[17]:
24584
In [18]:
words = []
for ii in range(0,len(text)):
    words.append(str(text.iloc[ii]['body_text']).split(" "))
In [19]:
n_gram_all = []

for word in words:
    # get n-grams for the instance
    n_gram = []
    for i in range(len(word)-2+1):
        n_gram.append("".join(word[i:i+2]))
    n_gram_all.append(n_gram)
In [20]:
from sklearn.feature_extraction.text import HashingVectorizer

# hash vectorizer instance
hvec = HashingVectorizer(lowercase=False, analyzer=lambda l:l, n_features=2**12)

# features matrix X
X = hvec.fit_transform(n_gram_all)
In [21]:
# Following cell may take 20-30 minutes to run
from sklearn.manifold import TSNE

#tsne = TSNE(verbose=1)
tsne = TSNE(verbose=1, perplexity=5)
X_embedded = tsne.fit_transform(X.toarray())
[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 24584 samples in 0.313s...
[t-SNE] Computed neighbors for 24584 samples in 52.697s...
[t-SNE] Computed conditional probabilities for sample 1000 / 24584
[t-SNE] Computed conditional probabilities for sample 2000 / 24584
[t-SNE] Computed conditional probabilities for sample 3000 / 24584
[t-SNE] Computed conditional probabilities for sample 4000 / 24584
[t-SNE] Computed conditional probabilities for sample 5000 / 24584
[t-SNE] Computed conditional probabilities for sample 6000 / 24584
[t-SNE] Computed conditional probabilities for sample 7000 / 24584
[t-SNE] Computed conditional probabilities for sample 8000 / 24584
[t-SNE] Computed conditional probabilities for sample 9000 / 24584
[t-SNE] Computed conditional probabilities for sample 10000 / 24584
[t-SNE] Computed conditional probabilities for sample 11000 / 24584
[t-SNE] Computed conditional probabilities for sample 12000 / 24584
[t-SNE] Computed conditional probabilities for sample 13000 / 24584
[t-SNE] Computed conditional probabilities for sample 14000 / 24584
[t-SNE] Computed conditional probabilities for sample 15000 / 24584
[t-SNE] Computed conditional probabilities for sample 16000 / 24584
[t-SNE] Computed conditional probabilities for sample 17000 / 24584
[t-SNE] Computed conditional probabilities for sample 18000 / 24584
[t-SNE] Computed conditional probabilities for sample 19000 / 24584
[t-SNE] Computed conditional probabilities for sample 20000 / 24584
[t-SNE] Computed conditional probabilities for sample 21000 / 24584
[t-SNE] Computed conditional probabilities for sample 22000 / 24584
[t-SNE] Computed conditional probabilities for sample 23000 / 24584
[t-SNE] Computed conditional probabilities for sample 24000 / 24584
[t-SNE] Computed conditional probabilities for sample 24584 / 24584
[t-SNE] Mean sigma: 0.126564
[t-SNE] KL divergence after 250 iterations with early exaggeration: 147.965591
[t-SNE] KL divergence after 1000 iterations: 4.554127
In [22]:
from matplotlib import pyplot as plt
import seaborn as sns

# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# colors
palette = sns.color_palette("bright", 1)

# plot
sns.scatterplot(X_embedded[:,0], X_embedded[:,1], palette=palette)

plt.title("t-SNE Covid-19 Articles")
# plt.savefig("plots/t-sne_covid19.png")
plt.show()
In [23]:
from sklearn.cluster import MiniBatchKMeans

k = 10
kmeans = MiniBatchKMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)
In [24]:
# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# colors
palette = sns.color_palette("bright", len(set(y_pred)))

# plot
sns.scatterplot(X_embedded[:,0], X_embedded[:,1], hue=y_pred, legend='full', palette=palette)
plt.title("t-SNE Covid-19 Articles - Clustered")
# plt.savefig("plots/t-sne_covid19_label.png")
plt.show()

Using K=20 now¶

In [25]:
k = 20
kmeans = MiniBatchKMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)
# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# colors
palette = sns.color_palette("bright", len(set(y_pred)))

# plot
sns.scatterplot(X_embedded[:,0], X_embedded[:,1], hue=y_pred, legend='full', palette=palette)
plt.title("t-SNE Covid-19 Articles - Clustered")
# plt.savefig("plots/t-sne_covid19_label.png")
plt.show()

By using tf-idf vectorizer and plain text features instead of 2-grams, we could see that the clusters obtained from K-means clustering (with K = 10) are more separable in the t-SNE plot. Now let's try to apply the tf-idf vectorizer on the 2-gram representation of documents instead of plain text, and then apply K-means clustering with K = 10.¶

In [26]:
# prepare n_gram_list
n_gram_list = []
for i in n_gram_all:
    n_gram_list.append(" ".join(i))
In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=2**12)
X = vectorizer.fit_transform(n_gram_list)
In [28]:
from sklearn.cluster import MiniBatchKMeans

k = 10
kmeans = MiniBatchKMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)
y = y_pred
In [29]:
# Following cell will take 20-30 minutes to run
from sklearn.manifold import TSNE

#tsne = TSNE(verbose=1)
tsne = TSNE(verbose=1, perplexity=5)
X_embedded = tsne.fit_transform(X.toarray())
[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 24584 samples in 0.070s...
[t-SNE] Computed neighbors for 24584 samples in 49.610s...
[t-SNE] Computed conditional probabilities for sample 1000 / 24584
[t-SNE] Computed conditional probabilities for sample 2000 / 24584
[t-SNE] Computed conditional probabilities for sample 3000 / 24584
[t-SNE] Computed conditional probabilities for sample 4000 / 24584
[t-SNE] Computed conditional probabilities for sample 5000 / 24584
[t-SNE] Computed conditional probabilities for sample 6000 / 24584
[t-SNE] Computed conditional probabilities for sample 7000 / 24584
[t-SNE] Computed conditional probabilities for sample 8000 / 24584
[t-SNE] Computed conditional probabilities for sample 9000 / 24584
[t-SNE] Computed conditional probabilities for sample 10000 / 24584
[t-SNE] Computed conditional probabilities for sample 11000 / 24584
[t-SNE] Computed conditional probabilities for sample 12000 / 24584
[t-SNE] Computed conditional probabilities for sample 13000 / 24584
[t-SNE] Computed conditional probabilities for sample 14000 / 24584
[t-SNE] Computed conditional probabilities for sample 15000 / 24584
[t-SNE] Computed conditional probabilities for sample 16000 / 24584
[t-SNE] Computed conditional probabilities for sample 17000 / 24584
[t-SNE] Computed conditional probabilities for sample 18000 / 24584
[t-SNE] Computed conditional probabilities for sample 19000 / 24584
[t-SNE] Computed conditional probabilities for sample 20000 / 24584
[t-SNE] Computed conditional probabilities for sample 21000 / 24584
[t-SNE] Computed conditional probabilities for sample 22000 / 24584
[t-SNE] Computed conditional probabilities for sample 23000 / 24584
[t-SNE] Computed conditional probabilities for sample 24000 / 24584
[t-SNE] Computed conditional probabilities for sample 24584 / 24584
[t-SNE] Mean sigma: 0.171953
[t-SNE] KL divergence after 250 iterations with early exaggeration: 156.275116
[t-SNE] KL divergence after 1000 iterations: 5.079303
In [30]:
from matplotlib import pyplot as plt
import seaborn as sns

# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# colors
palette = sns.color_palette("bright", len(set(y)))

# plot
sns.scatterplot(X_embedded[:,0], X_embedded[:,1], hue=y, legend='full', palette=palette)
plt.title("t-SNE Covid-19 Articles - Clustered(K-Means) - Tf-idf with 2-Gram")
plt.show()

Interactive Plot¶

In [31]:
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, CustomJS
from bokeh.palettes import Category20
from bokeh.transform import linear_cmap
from bokeh.io import output_file, show
from bokeh.transform import transform
from bokeh.io import output_notebook
from bokeh.plotting import figure
from bokeh.layouts import column
from bokeh.models import RadioButtonGroup
from bokeh.models import TextInput
from bokeh.layouts import gridplot
from bokeh.models import Div
from bokeh.models import Paragraph
from bokeh.layouts import column, widgetbox

output_notebook()
y_labels = y_pred

# data sources
source = ColumnDataSource(data=dict(
    x= X_embedded[:,0], 
    y= X_embedded[:,1],
    x_backup = X_embedded[:,0],
    y_backup = X_embedded[:,1],
    desc= y_labels, 
    titles= df_covid['title'],
    authors = df_covid['authors'],
    journal = df_covid['journal'],
    abstract = df_covid['abstract_summary'],
    labels = ["C-" + str(x) for x in y_labels]
    ))

# hover over information
hover = HoverTool(tooltips=[
    ("Title", "@titles{safe}"),
    ("Author(s)", "@authors"),
    ("Journal", "@journal"),
    ("Abstract", "@abstract{safe}"),
],
                 point_policy="follow_mouse")

# map colors
mapper = linear_cmap(field_name='desc', 
                     palette=Category20[20],
                     low=min(y_labels) ,high=max(y_labels))

# prepare the figure
p = figure(plot_width=800, plot_height=800, 
           tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset'], 
           title="t-SNE Covid-19 Articles, Clustered(K-Means), Tf-idf with Plain Text", 
           toolbar_location="right")

# plot
p.scatter('x', 'y', size=5, 
          source=source,
          fill_color=mapper,
          line_alpha=0.3,
          line_color="black",
          legend = 'labels')

# add callback to control 
callback = CustomJS(args=dict(p=p, source=source), code="""
            
            var radio_value = cb_obj.active;
            var data = source.data; 
            
            var x = data['x'];
            var y = data['y'];
            
            var x_backup = data['x_backup'];
            var y_backup = data['y_backup'];
            
            var labels = data['desc'];
            
            if (radio_value == '20') {
                for (i = 0; i < x.length; i++) {
                    x[i] = x_backup[i];
                    y[i] = y_backup[i];
                }
            }
            else {
                for (var i = 0; i < x.length; i++) {
                    if(labels[i] == radio_value) {
                        x[i] = x_backup[i];
                        y[i] = y_backup[i];
                    } else {
                        x[i] = undefined;
                        y[i] = undefined;
                    }
                }
            }


        source.change.emit();
        """)

# callback for searchbar
keyword_callback = CustomJS(args=dict(p=p, source=source), code="""
            
            var text_value = cb_obj.value;
            var data = source.data; 
            
            var x = data['x'];
            var y = data['y'];
            
            var x_backup = data['x_backup'];
            var y_backup = data['y_backup'];
            
            var abstract = data['abstract'];
            var titles = data['titles'];
            var authors = data['authors'];
            var journal = data['journal'];

            for (var i = 0; i < x.length; i++) {
                if(abstract[i].includes(text_value) || 
                   titles[i].includes(text_value) || 
                   authors[i].includes(text_value) || 
                   journal[i].includes(text_value)) {
                    x[i] = x_backup[i];
                    y[i] = y_backup[i];
                } else {
                    x[i] = undefined;
                    y[i] = undefined;
                }
            }
            


        source.change.emit();
        """)

# option
option = RadioButtonGroup(labels=["C-0", "C-1", "C-2",
                                  "C-3", "C-4", "C-5",
                                  "C-6", "C-7", "C-8",
                                  "C-9", "C-10", "C-11",
                                  "C-12", "C-13", "C-14",
                                  "C-15", "C-16", "C-17",
                                  "C-18", "C-19", "All"], 
                          active=20)
option.js_on_click(callback)

# search box
keyword = TextInput(title="Search:",)
keyword.js_on_change('value', keyword_callback)

#header
header = Div(text="""<h1>COVID-19 Literature Cluster</h1>""")

# show
show(column(header, widgetbox(option, keyword),p))
Loading BokehJS ...
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'WidgetBox' is deprecated and will be removed in Bokeh 3.0, use 'bokeh.models.Column' instead